from urllib.parse import urljoin

from lxml import etree
import pandas as pd
import requests
# Root of the site; relative detail-page hrefs are joined onto this.
BASE_DOMAIN='https://www.dytt8.net/'
# url='https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
# Mobile Chrome User-Agent sent with every request (presumably to mimic a
# real browser — TODO confirm the site actually requires it).
HEADERS={
    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36'
}

def get_detail_url(url):
    """Fetch one listing page and return absolute URLs of its movie detail pages.

    Args:
        url: URL of a ``list_23_N.html`` listing page.

    Returns:
        list[str]: absolute detail-page URLs found in the listing table.
    """
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages without a charset header. Decode the
    # raw bytes directly instead of round-tripping response.text through
    # ISO-8859-1, which is lossy/fragile if requests guesses another encoding.
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin avoids the '//' double slash that naive BASE_DOMAIN + href
    # concatenation produces when href starts with '/'. Returning a list
    # (not a one-shot map object) lets callers iterate more than once.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]

def parse_detail_page(url):
    """Fetch a movie detail page and extract its metadata.

    Args:
        url: absolute URL of a movie detail page.

    Returns:
        dict with keys: title, cover, screenshot, year, country, catagory,
        douban_rating, duration, director, actors, profile, download_url.
        Keys are only present when the matching "◎…" label appears on the
        page (cover/screenshot/download_url fall back to None).
    """
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # GBK page served without a charset header: decode the raw bytes directly
    # instead of the lossy ISO-8859-1 encode/decode round trip.
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title'] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # First image is the poster, second (when present) a screenshot; guard
    # against pages with no images instead of raising IndexError.
    movie['cover'] = imgs[0] if imgs else None
    movie['screenshot'] = imgs[1] if len(imgs) > 1 else None

    def parse_info(info, rule):
        # Strip the "◎…" label prefix and surrounding whitespace from a value line.
        return info.replace(rule, '').strip()

    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年　　代"):
            movie['year'] = parse_info(info, "◎年　　代")
        elif info.startswith("◎产　　地"):
            movie['country'] = parse_info(info, "◎产　　地")
        elif info.startswith("◎类　　别"):
            # NOTE: 'catagory' misspelling kept — spider()'s column list uses it.
            movie['catagory'] = parse_info(info, "◎类　　别")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_rating'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片　　长"):
            movie['duration'] = parse_info(info, "◎片　　长")
        elif info.startswith("◎导　　演"):
            # BUG FIX: original stripped "导　　演" without the leading "◎",
            # leaving the marker character in the stored value.
            movie['director'] = parse_info(info, "◎导　　演")
        elif info.startswith("◎主　　演"):
            # Lead actor shares the label line; the rest follow one per text
            # node until the next "◎" label starts.
            actors = [parse_info(info, "◎主　　演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简　　介") or info.startswith("◎简  介"):
            # BUG FIX: original overwrote movie['profile'] with every remaining
            # text node (ending up with the last one). Collect synopsis lines
            # only until the next "◎" label / download section, and also match
            # the fullwidth-space label used by the other fields.
            profile_lines = []
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎") or line.startswith("【下载地址】"):
                    break
                if line:
                    profile_lines.append(line)
            movie['profile'] = "\n".join(profile_lines)
    # Guard against pages without a download link instead of IndexError.
    download_urls = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie['download_url'] = download_urls[0] if download_urls else None
    return movie



def spider(pages=1):
    """Crawl dytt8 listing pages and write the scraped movies to test1.csv.

    Args:
        pages: number of listing pages to crawl, starting at page 1.
            Defaults to 1, matching the original behavior (the old code
            had a debug ``break`` that stopped after the first page even
            though it looped ``range(1, 8)``).
    """
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(1, pages + 1):
        listing_url = base_url.format(page)
        # Inner loop: visit every movie detail URL found on this listing page.
        for detail_url in get_detail_url(listing_url):
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)
    # BUG FIX: original column list silently dropped the 'country', 'duration'
    # and 'profile' fields that parse_detail_page produces.
    df = pd.DataFrame(
        data=movies,
        columns=['title', 'cover', 'screenshot', 'year', 'country', 'catagory',
                 'douban_rating', 'duration', 'director', 'actors', 'profile',
                 'download_url'])
    df.to_csv('test1.csv')

# Script entry point: run the crawler only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    spider()

